#This file produces images for the paper
rm(list=ls())
setwd("~/Documents/Research Projects/Cohort Prediction/Data/")
library(dplyr)
library(caret)      #wrapper model training
library(ranger)     #Random Forests
library(plyr)
library(glmnet)     #Lasso packages
library(glmnetUtils)

#Data Prep----
#Set Path1 to be the path to factor_data_criminal_history
#Set Path2 to be te path to one_hot_data_criminal_history
factor_data = read.csv(Path1, stringsAsFactors = TRUE)%>%
  select(-c("X"))

factor_data = factor_data %>%
  mutate(sp_ethn_aw = relevel(sp_ethn_aw, ref = "White and Other"),
         pcimgen_rev = relevel(pcimgen_rev, ref = "3rd generation"),
         marital_status = relevel(marital_status, ref = "Married"),
         arrested17_18 = relevel(arrested17_18, ref = "never_arrested"),
         arrested19_24 = relevel(arrested19_24, ref = "never_arrested"))

factor_train = factor_data%>%
  filter(older_cohort)%>%
  select(-c("older_cohort"))
factor_test = factor_data%>%
  filter(!older_cohort)%>%
  select(-c("older_cohort"))

one_hot_data = read.csv(file = Path2, stringsAsFactors = TRUE) %>%
  select(-c("X"))

train = one_hot_data%>%
  filter(older_cohort)%>%
  select(-c("older_cohort"))
test = one_hot_data%>%
  filter(!older_cohort)%>%
  select(-c("older_cohort"))

#Random Forest Model ---------
set.seed(42)
model = ranger(arrested19_24 ~., data=factor_train, probability = TRUE, respect.unordered.factors = TRUE, importance = "impurity", splitrule = "extratrees")

set.seed(42)
model_young = ranger(arrested19_24 ~., data=factor_test, probability = TRUE, respect.unordered.factors = TRUE, importance = "impurity", splitrule = "extratrees")


train_probs = model$predictions[,"arrested"]
test_probs = predict(model, type = "response", data = factor_test)$predictions[,"arrested"]
young_probs = model_young$predictions[,"arrested"]

factor_train$pnas_rf = train_probs
factor_test$pnas_rf = test_probs
factor_test$pnas_rf_yy = young_probs

#LASSO Model ---------
set.seed(42)
model = cv.glmnet(arrested19_24 ~ ., 
                  data = train, 
                  alpha = 1,
                  nfolds = 10,
                  keep = TRUE,
                  family = "binomial")

set.seed(42)
model_young = cv.glmnet(arrested19_24 ~ ., 
                        data = test, 
                        alpha = 1,
                        nfolds = 10,
                        keep = TRUE,
                        family = "binomial")


lambda_min = model$lambda.min
train_probs = (model$fit.preval[,model$lambda==lambda_min] %>%
                 plogis())
test_probs = (predict(model, type = "response", s = lambda_min, newdata = test)) %>%
  as.numeric()
lambda_young = model_young$lambda.min
young_probs =(model_young$fit.preval[,model_young$lambda==lambda_young] %>%
                plogis())

factor_train$pnas_lasso = train_probs
factor_test$pnas_lasso = test_probs
factor_test$pnas_lasso_yy = young_probs

#Ridge Model ---------
set.seed(42)
model = cv.glmnet(arrested19_24 ~ ., 
                  data = train, 
                  alpha = 0,
                  nfolds = 10,
                  keep = TRUE,
                  family = "binomial")

set.seed(42)
model_young = cv.glmnet(arrested19_24 ~ ., 
                        data = test, 
                        alpha = 0,
                        nfolds = 10,
                        keep = TRUE,
                        family = "binomial")


lambda_min = model$lambda.min
train_probs = (model$fit.preval[,model$lambda==lambda_min] %>%
                 plogis())
test_probs = (predict(model, type = "response", s = lambda_min, newdata = test)) %>%
  as.numeric()
lambda_young = model_young$lambda.min
young_probs =(model_young$fit.preval[,model_young$lambda==lambda_young] %>%
                plogis())

factor_train$pnas_ridge = train_probs
factor_test$pnas_ridge = test_probs
factor_test$pnas_ridge_yy = young_probs

#Logistic Regression Model ---------
train_control = trainControl(method = "cv", number = 10, savePredictions = "final", classProbs = TRUE)

set.seed(42)
model = train(arrested19_24 ~., 
              data = train,
              trControl = train_control,
              method = "glm",
              family = binomial(),
              maxit = 50)

set.seed(42)
model_young = train(arrested19_24 ~., 
                    data = test,
                    trControl = train_control,
                    method = "glm",
                    family = binomial(),
                    maxit = 50)


train_probs = model$pred[order(model$pred$rowIndex),]$arrested
test_probs = predict(model, newdata = test, type = "prob")$arrested
young_probs = model_young$pred[order(model_young$pred$rowIndex),]$arrested

factor_train$pnas_lr = train_probs
factor_test$pnas_lr = test_probs
factor_test$pnas_lr_yy = young_probs

#Export Results
write.csv(factor_train, "older_cohort_binCH_results.csv")
write.csv(factor_test, "younger_cohort_binCH_results.csv")